1. Data Processing
# Load the Data
knitr::opts_knit$set(root.dir = '/Users/Faiz Muthi/Downloads/data')
customers_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/customers_dataset.csv', sep=",")
products_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/products_dataset.csv', sep=",")
geolocation_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/geolocation_dataset.csv', sep=",")
order_items_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/order_items_dataset.csv', sep=",")
order_reviews_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/order_reviews_dataset.csv', sep=",")
sellers_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/sellers_dataset.csv', sep=",")
order_payments_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/order_payments_dataset.csv', sep=",")
orders_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/orders_dataset.csv', sep=",")
product_category_name_translation <- read.csv('/Users/Faiz Muthi/Downloads/data/product_category_name_translation.csv', sep=",")
# Check the structure of datasets
str(customers_dataset)
## 'data.frame': 99441 obs. of 5 variables:
## $ customer_id : chr "06b8999e2fba1a1fbc88172c00ba8bc7" "18955e83d337fd6b2def6b18a428ac77" "4e7b3e00288586ebd08712fdd0374a03" "b2b6027bc5c5109e529d4dc6358b12c3" ...
## $ customer_unique_id : chr "861eff4711a542e4b93843c6dd7febb0" "290c77bc529b7ac935b93aa66c333dc3" "060e732b5b29e8181a18229c7b0b2b5e" "259dac757896d24d7702b9acbbff3f3c" ...
## $ customer_zip_code_prefix: int 14409 9790 1151 8775 13056 89254 4534 35182 81560 30575 ...
## $ customer_city : chr "franca" "sao bernardo do campo" "sao paulo" "mogi das cruzes" ...
## $ customer_state : chr "SP" "SP" "SP" "SP" ...
str(products_dataset)
## 'data.frame': 32951 obs. of 9 variables:
## $ product_id : chr "1e9e8ef04dbcff4541ed26657ea517e5" "3aa071139cb16b67ca9e5dea641aaa2f" "96bd76ec8810374ed1b65e291975717f" "cef67bcfe19066a932b7673e239eb23d" ...
## $ product_category_name : chr "perfumaria" "artes" "esporte_lazer" "bebes" ...
## $ product_name_lenght : int 40 44 46 27 37 60 56 56 57 36 ...
## $ product_description_lenght: int 287 276 250 261 402 745 1272 184 163 1156 ...
## $ product_photos_qty : int 1 1 1 1 4 1 4 2 1 1 ...
## $ product_weight_g : int 225 1000 154 371 625 200 18350 900 400 600 ...
## $ product_length_cm : int 16 30 18 26 20 38 70 40 27 17 ...
## $ product_height_cm : int 10 18 9 4 17 5 24 8 13 10 ...
## $ product_width_cm : int 14 20 15 26 13 11 44 40 17 12 ...
str(geolocation_dataset)
## 'data.frame': 1000163 obs. of 5 variables:
## $ geolocation_zip_code_prefix: int 1037 1046 1046 1041 1035 1012 1047 1013 1029 1011 ...
## $ geolocation_lat : num -23.5 -23.5 -23.5 -23.5 -23.5 ...
## $ geolocation_lng : num -46.6 -46.6 -46.6 -46.6 -46.6 ...
## $ geolocation_city : chr "sao paulo" "sao paulo" "sao paulo" "sao paulo" ...
## $ geolocation_state : chr "SP" "SP" "SP" "SP" ...
str(order_items_dataset)
## 'data.frame': 112650 obs. of 7 variables:
## $ order_id : chr "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
## $ order_item_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ product_id : chr "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
## $ seller_id : chr "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
## $ shipping_limit_date: chr "2017-09-19 09:45:35" "2017-05-03 11:05:13" "2018-01-18 14:48:30" "2018-08-15 10:10:18" ...
## $ price : num 58.9 239.9 199 13 199.9 ...
## $ freight_value : num 13.3 19.9 17.9 12.8 18.1 ...
str(order_reviews_dataset)
## 'data.frame': 99224 obs. of 7 variables:
## $ review_id : chr "7bc2406110b926393aa56f80a40eba40" "80e641a11e56f04c1ad469d5645fdfde" "228ce5500dc1d8e020d8d1322874b6f0" "e64fb393e7b32834bb789ff8bb30750e" ...
## $ order_id : chr "73fc7af87114b39712e6da79b0a377eb" "a548910a1c6147796b98fdf73dbeba33" "f9e4b658b201a9f2ecdecbb34bed034b" "658677c97b385a9be170737859d3511b" ...
## $ review_score : int 4 5 5 5 5 1 5 5 5 4 ...
## $ review_comment_title : chr "" "" "" "" ...
## $ review_comment_message : chr "" "" "" "Recebi bem antes do prazo estipulado." ...
## $ review_creation_date : chr "2018-01-18 00:00:00" "2018-03-10 00:00:00" "2018-02-17 00:00:00" "2017-04-21 00:00:00" ...
## $ review_answer_timestamp: chr "2018-01-18 21:46:59" "2018-03-11 03:05:13" "2018-02-18 14:36:24" "2017-04-21 22:02:06" ...
str(sellers_dataset)
## 'data.frame': 3095 obs. of 4 variables:
## $ seller_id : chr "3442f8959a84dea7ee197c632cb2df15" "d1b65fc7debc3361ea86b5f14c68d2e2" "ce3ad9de960102d0677a81f5d0bb7b2d" "c0f3eea2e14555b6faeea3dd58c1b1c3" ...
## $ seller_zip_code_prefix: int 13023 13844 20031 4195 12914 20920 55325 16304 1529 80310 ...
## $ seller_city : chr "campinas" "mogi guacu" "rio de janeiro" "sao paulo" ...
## $ seller_state : chr "SP" "SP" "RJ" "SP" ...
str(order_payments_dataset)
## 'data.frame': 103886 obs. of 5 variables:
## $ order_id : chr "b81ef226f3fe1789b1e8b2acac839d17" "a9810da82917af2d9aefd1278f1dcfa0" "25e8ea4e93396b6fa0d3dd708e76c1bd" "ba78997921bbcdc1373bb41e913ab953" ...
## $ payment_sequential : int 1 1 1 1 1 1 1 1 1 1 ...
## $ payment_type : chr "credit_card" "credit_card" "credit_card" "credit_card" ...
## $ payment_installments: int 8 1 1 8 2 2 1 3 6 1 ...
## $ payment_value : num 99.3 24.4 65.7 107.8 128.4 ...
str(orders_dataset)
## 'data.frame': 99441 obs. of 8 variables:
## $ order_id : chr "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
## $ customer_id : chr "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
## $ order_status : chr "delivered" "delivered" "delivered" "delivered" ...
## $ order_purchase_timestamp : chr "2017-10-02 10:56:33" "2018-07-24 20:41:37" "2018-08-08 08:38:49" "2017-11-18 19:28:06" ...
## $ order_approved_at : chr "2017-10-02 11:07:15" "2018-07-26 03:24:27" "2018-08-08 08:55:23" "2017-11-18 19:45:59" ...
## $ order_delivered_carrier_date : chr "2017-10-04 19:55:00" "2018-07-26 14:31:00" "2018-08-08 13:50:00" "2017-11-22 13:39:59" ...
## $ order_delivered_customer_date: chr "2017-10-10 21:25:13" "2018-08-07 15:27:45" "2018-08-17 18:06:29" "2017-12-02 00:28:42" ...
## $ order_estimated_delivery_date: chr "2017-10-18 00:00:00" "2018-08-13 00:00:00" "2018-09-04 00:00:00" "2017-12-15 00:00:00" ...
str(product_category_name_translation)
## 'data.frame': 71 obs. of 2 variables:
## $ product_category_name : chr "beleza_saude" "informatica_acessorios" "automotivo" "cama_mesa_banho" ...
## $ product_category_name_english: chr "health_beauty" "computers_accessories" "auto" "bed_bath_table" ...
# Summary statistics
summary(customers_dataset)
## customer_id customer_unique_id customer_zip_code_prefix
## Length:99441 Length:99441 Min. : 1003
## Class :character Class :character 1st Qu.:11347
## Mode :character Mode :character Median :24416
## Mean :35137
## 3rd Qu.:58900
## Max. :99990
## customer_city customer_state
## Length:99441 Length:99441
## Class :character Class :character
## Mode :character Mode :character
##
##
##
summary(products_dataset)
## product_id product_category_name product_name_lenght
## Length:32951 Length:32951 Min. : 5.00
## Class :character Class :character 1st Qu.:42.00
## Mode :character Mode :character Median :51.00
## Mean :48.48
## 3rd Qu.:57.00
## Max. :76.00
## NA's :610
## product_description_lenght product_photos_qty product_weight_g
## Min. : 4.0 Min. : 1.000 Min. : 0
## 1st Qu.: 339.0 1st Qu.: 1.000 1st Qu.: 300
## Median : 595.0 Median : 1.000 Median : 700
## Mean : 771.5 Mean : 2.189 Mean : 2276
## 3rd Qu.: 972.0 3rd Qu.: 3.000 3rd Qu.: 1900
## Max. :3992.0 Max. :20.000 Max. :40425
## NA's :610 NA's :610 NA's :2
## product_length_cm product_height_cm product_width_cm
## Min. : 7.00 Min. : 2.00 Min. : 6.0
## 1st Qu.: 18.00 1st Qu.: 8.00 1st Qu.: 15.0
## Median : 25.00 Median : 13.00 Median : 20.0
## Mean : 30.82 Mean : 16.94 Mean : 23.2
## 3rd Qu.: 38.00 3rd Qu.: 21.00 3rd Qu.: 30.0
## Max. :105.00 Max. :105.00 Max. :118.0
## NA's :2 NA's :2 NA's :2
summary(geolocation_dataset)
## geolocation_zip_code_prefix geolocation_lat geolocation_lng
## Min. : 1001 Min. :-36.61 Min. :-101.47
## 1st Qu.:11075 1st Qu.:-23.60 1st Qu.: -48.57
## Median :26530 Median :-22.92 Median : -46.64
## Mean :36574 Mean :-21.18 Mean : -46.39
## 3rd Qu.:63504 3rd Qu.:-19.98 3rd Qu.: -43.77
## Max. :99990 Max. : 45.07 Max. : 121.11
## geolocation_city geolocation_state
## Length:1000163 Length:1000163
## Class :character Class :character
## Mode :character Mode :character
##
##
##
summary(order_items_dataset)
## order_id order_item_id product_id seller_id
## Length:112650 Min. : 1.000 Length:112650 Length:112650
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 1.000 Mode :character Mode :character
## Mean : 1.198
## 3rd Qu.: 1.000
## Max. :21.000
## shipping_limit_date price freight_value
## Length:112650 Min. : 0.85 Min. : 0.00
## Class :character 1st Qu.: 39.90 1st Qu.: 13.08
## Mode :character Median : 74.99 Median : 16.26
## Mean : 120.65 Mean : 19.99
## 3rd Qu.: 134.90 3rd Qu.: 21.15
## Max. :6735.00 Max. :409.68
summary(order_reviews_dataset)
## review_id order_id review_score review_comment_title
## Length:99224 Length:99224 Min. :1.000 Length:99224
## Class :character Class :character 1st Qu.:4.000 Class :character
## Mode :character Mode :character Median :5.000 Mode :character
## Mean :4.086
## 3rd Qu.:5.000
## Max. :5.000
## review_comment_message review_creation_date review_answer_timestamp
## Length:99224 Length:99224 Length:99224
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
summary(sellers_dataset)
## seller_id seller_zip_code_prefix seller_city
## Length:3095 Min. : 1001 Length:3095
## Class :character 1st Qu.: 7094 Class :character
## Mode :character Median :14940 Mode :character
## Mean :32291
## 3rd Qu.:64553
## Max. :99730
## seller_state
## Length:3095
## Class :character
## Mode :character
##
##
##
summary(order_payments_dataset)
## order_id payment_sequential payment_type payment_installments
## Length:103886 Min. : 1.000 Length:103886 Min. : 0.000
## Class :character 1st Qu.: 1.000 Class :character 1st Qu.: 1.000
## Mode :character Median : 1.000 Mode :character Median : 1.000
## Mean : 1.093 Mean : 2.853
## 3rd Qu.: 1.000 3rd Qu.: 4.000
## Max. :29.000 Max. :24.000
## payment_value
## Min. : 0.00
## 1st Qu.: 56.79
## Median : 100.00
## Mean : 154.10
## 3rd Qu.: 171.84
## Max. :13664.08
summary(orders_dataset)
## order_id customer_id order_status
## Length:99441 Length:99441 Length:99441
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## order_purchase_timestamp order_approved_at order_delivered_carrier_date
## Length:99441 Length:99441 Length:99441
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## order_delivered_customer_date order_estimated_delivery_date
## Length:99441 Length:99441
## Class :character Class :character
## Mode :character Mode :character
# Handle Missing Values
# Drop rows with missing values
customers_dataset <- na.omit(customers_dataset)
products_dataset <- na.omit(products_dataset)
geolocation_dataset <- na.omit(geolocation_dataset)
order_items_dataset <- na.omit(order_items_dataset)
order_reviews_dataset <- na.omit(order_reviews_dataset)
sellers_dataset <- na.omit(sellers_dataset)
order_payments_dataset <- na.omit(order_payments_dataset)
orders_dataset <- na.omit(orders_dataset)
# Convert date columns to Date type
orders_dataset$order_purchase_timestamp <- as.Date(orders_dataset$order_purchase_timestamp)
orders_dataset$order_delivered_customer_date <- as.Date(orders_dataset$order_delivered_customer_date)
orders_dataset$order_approved_at <- as.Date(orders_dataset$order_approved_at)
orders_dataset$order_delivered_carrier_date <- as.Date(orders_dataset$order_delivered_carrier_date)
orders_dataset$order_estimated_delivery_date <- as.Date(orders_dataset$order_estimated_delivery_date)
2. Basic Analytics
library(ggplot2)
# Customers Dataset
# Distribution of customer zip codes
ggplot(customers_dataset, aes(x = customer_zip_code_prefix)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
labs(title = "Distribution of Customer Zip Codes", x = "Zip Code Prefix", y = "Frequency")
# Count of customers by state
ggplot(customers_dataset, aes(x = customer_state)) +
geom_bar(fill = "green", color = "black") +
labs(title = "Number of Customers by State", x = "State", y = "Count")
# Products Dataset
# Distribution of product weights
ggplot(products_dataset, aes(x = product_weight_g)) +
geom_histogram(bins = 30, fill = "orange", color = "black") +
labs(title = "Distribution of Product Weights", x = "Weight (g)", y = "Frequency")
# Count of products by category
ggplot(products_dataset, aes(x = product_category_name)) +
geom_bar(fill = "purple", color = "black") +
labs(title = "Number of Products by Category", x = "Product Category", y = "Count")
# Geolocation Dataset
# Distribution of latitude and longitude
ggplot(geolocation_dataset, aes(x = geolocation_lng, y = geolocation_lat)) +
geom_point(alpha = 0.5, color = "red") +
labs(title = "Geolocation of Customers", x = "Longitude", y = "Latitude")
# Order Items Dataset
# Distribution of prices
ggplot(order_items_dataset, aes(x = price)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
labs(title = "Distribution of Order Prices", x = "Price", y = "Frequency")
# Count of items per order
ggplot(order_items_dataset, aes(x = order_id)) +
geom_bar(fill = "green", color = "black") +
labs(title = "Number of Items per Order", x = "Order ID", y = "Count")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Order Reviews Dataset
# Distribution of review scores
ggplot(order_reviews_dataset, aes(x = review_score)) +
geom_bar(fill = "purple", color = "black") +
labs(title = "Distribution of Review Scores", x = "Review Score", y = "Count")
# Average review score per order
order_reviews_avg <- order_reviews_dataset %>%
group_by(order_id) %>%
summarise(avg_review_score = mean(review_score, na.rm = TRUE))
ggplot(order_reviews_avg, aes(x = avg_review_score)) +
geom_histogram(bins = 10, fill = "orange", color = "black") +
labs(title = "Average Review Score per Order", x = "Average Review Score", y = "Frequency")
# Sellers Dataset
# Count of sellers by state
ggplot(sellers_dataset, aes(x = seller_state)) +
geom_bar(fill = "blue", color = "black") +
labs(title = "Number of Sellers by State", x = "State", y = "Count")
# Order Payments Dataset
# Distribution of payment values
ggplot(order_payments_dataset, aes(x = payment_value)) +
geom_histogram(bins = 30, fill = "green", color = "black") +
labs(title = "Distribution of Payment Values", x = "Payment Value", y = "Frequency")
# Payment types
ggplot(order_payments_dataset, aes(x = payment_type)) +
geom_bar(fill = "purple", color = "black") +
labs(title = "Count of Payment Types", x = "Payment Type", y = "Count")
# Orders Dataset
# Distribution of order statuses
ggplot(orders_dataset, aes(x = order_status)) +
geom_bar(fill = "blue", color = "black") +
labs(title = "Count of Order Statuses", x = "Order Status", y = "Count")
# Delivery times
orders_dataset$delivery_time <- as.numeric(difftime(orders_dataset$order_delivered_customer_date, orders_dataset$order_purchase_timestamp, units = "days"))
ggplot(orders_dataset, aes(x = delivery_time)) +
geom_histogram(bins = 30, fill = "orange", color = "black") +
labs(title = "Distribution of Delivery Times", x = "Delivery Time (days)", y = "Frequency")
## Warning: Removed 2965 rows containing non-finite outside the scale range
## (`stat_bin()`).
3. Advanced Analytics
library(dplyr)
# Join datasets on order_id
merged_dataset_order <- orders_dataset %>%
left_join(order_items_dataset, by = "order_id")
merged_dataset_order
# Forecasting Sales
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Convert order_purchase_timestamp to Date type
merged_dataset_order$order_purchase_timestamp <- as.Date(merged_dataset_order$order_purchase_timestamp)
# Aggregate sales by month
monthly_sales <- merged_dataset_order %>%
group_by(month = floor_date(order_purchase_timestamp, "month")) %>%
summarise(total_sales = sum(price, na.rm = TRUE))
# Define the start date and frequency
start_date <- as.Date(paste0(year(min(monthly_sales$month)), "-", month(min(monthly_sales$month)), "-01"))
frequency <- 12 # Monthly data
# Create a time series object
sales_ts <- ts(monthly_sales$total_sales, start = c(year(start_date), month(start_date)), frequency = frequency)
# Apply Holt-Winters method
holt_winters_model <- HoltWinters(sales_ts)
# Forecast the next 12 months
forecast_values <- forecast(holt_winters_model, h = 12)
# Calculate end of the series
end_date <- seq.Date(from = start_date, by = "month", length.out = length(sales_ts))
# Generate future dates
forecast_dates <- seq.Date(from = end_date[length(end_date)] + months(1), by = "month", length.out = 12)
# Convert forecast to data frame for ggplot
forecast_df <- data.frame(
Date = forecast_dates,
Forecast = as.numeric(forecast_values$mean),
Lower_80 = as.numeric(forecast_values$lower[,1]),
Upper_80 = as.numeric(forecast_values$upper[,1]),
Lower_95 = as.numeric(forecast_values$lower[,2]),
Upper_95 = as.numeric(forecast_values$upper[,2])
)
forecast_df
# Plot the historical data and forecast
ggplot() +
geom_line(data = monthly_sales, aes(x = month, y = total_sales), color = "blue") +
geom_line(data = forecast_df, aes(x = Date, y = Forecast), color = "red") +
geom_ribbon(data = forecast_df, aes(x = Date, ymin = Lower_95, ymax = Upper_95), alpha = 0.2) +
labs(title = "Sales Forecast from 2017 using Holt-Winters Method", x = "Date", y = "Total Sales") +
theme_minimal()
# Geoanalysis (Using geolocation_dataset):
# install.packages("rnaturalearthdata")
library(rnaturalearth)
library(rnaturalearthdata)
##
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(viridis)
## Loading required package: viridisLite
geolocation_dataset$geolocation_lat <- as.numeric(geolocation_dataset$geolocation_lat)
geolocation_dataset$geolocation_lng <- as.numeric(geolocation_dataset$geolocation_lng)
# Convert to sf object
geolocation_sf <- st_as_sf(geolocation_dataset,
coords = c("geolocation_lng", "geolocation_lat"),
crs = 4326)
# Aggregate data
state_data <- geolocation_sf %>%
group_by(geolocation_state) %>%
summarise(geometry = st_union(geometry), count = n())
# Get the world map
world <- ne_countries(scale = "medium", returnclass = "sf")
# Plot the map with geolocation data
ggplot(data = world) +
geom_sf(fill = "lightgray") +
geom_sf(data = state_data, aes(size = count, color = count), alpha = 0.7) +
scale_color_viridis_c() +
labs(title = "Geolocation of Customers by State",
x = "Longitude",
y = "Latitude",
size = "Number of Customers",
color = "Number of Customers") +
theme_minimal()
# Outlier Detection (Using order_items_dataset):
# Boxplot to detect outliers
ggplot(order_items_dataset, aes(x = "", y = price)) +
geom_boxplot() +
labs(title = "Boxplot of Order Prices", y = "Price")
# Identifying outliers
order_items_dataset <- order_items_dataset %>%
mutate(is_outlier = price > quantile(price, 0.95, na.rm = TRUE))
# Visualize outliers
ggplot(order_items_dataset, aes(x = price, color = as.factor(is_outlier))) +
geom_histogram(bins = 30) +
labs(title = "Outlier Detection in Order Prices")
# RFM Analysis (Using orders_dataset):
library(cluster)
merged_dataset_order_RFM <- orders_dataset %>%
left_join(order_items_dataset, by = "order_id")
merged_dataset_order_RFM
# Compute RFM metrics
rfm_data <- merged_dataset_order_RFM %>%
group_by(customer_id) %>%
summarise(
Recency = as.numeric(difftime(Sys.Date(), max(order_purchase_timestamp), units = "days")),
Frequency = n(),
Monetary = sum(price, na.rm = TRUE)
)
# K-means clustering
set.seed(123)
rfm_clusters <- kmeans(rfm_data %>% select(Recency, Frequency, Monetary), centers = 4)
rfm_data$cluster <- rfm_clusters$cluster
# Visualize RFM clusters
ggplot(rfm_data, aes(x = Recency, y = Monetary, color = as.factor(cluster))) +
geom_point() +
labs(title = "RFM Clustering")
4. Generate a Model
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
library(ROSE) # For balancing the dataset
## Loaded ROSE 0.0-4
merged_dataset_order_generate <- orders_dataset %>%
left_join(order_items_dataset, by = "order_id")
# Convert columns to appropriate types
merged_dataset_order_generate <- merged_dataset_order_generate %>%
mutate(
order_purchase_timestamp = as.Date(order_purchase_timestamp),
order_approved_at = as.Date(order_approved_at),
price = as.numeric(price),
freight_value = as.numeric(freight_value),
order_status = as.factor(order_status)
)
# Remove rows with missing values or impute them
merged_dataset_order_generate <- na.omit(merged_dataset_order_generate)
# Balance the dataset
balanced_data <- ovun.sample(order_status ~ price + freight_value,
data = merged_dataset_order_generate,
method = "both", # Both oversampling and undersampling
p = 0.5, # Desired proportion
seed = 123)$data
# Split the balanced data
set.seed(123)
train_index <- createDataPartition(balanced_data$order_status, p = 0.7, list = FALSE)
train_data <- balanced_data[train_index, ]
test_data <- balanced_data[-train_index, ]
# Train Random Forest model
rf_model <- randomForest(order_status ~ price + freight_value, data = train_data)
# Make predictions
predictions <- predict(rf_model, test_data)
# Evaluate the model
confusionMatrix(predictions, test_data$order_status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction delivered canceled
## delivered 16599 0
## canceled 11 16443
##
## Accuracy : 0.9997
## 95% CI : (0.9994, 0.9998)
## No Information Rate : 0.5025
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9993
##
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 0.9993
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9993
## Prevalence : 0.5025
## Detection Rate : 0.5022
## Detection Prevalence : 0.5022
## Balanced Accuracy : 0.9997
##
## 'Positive' Class : delivered
##
according to the above result, Sensitivity and Positive Predictive Value for the “delivered” class are very high, but the model struggles with predicting other classes